import jieba
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence


# Load the stopword list into a set (for fast membership checks)
stopwords = set()
with open('stopwords.txt', 'r', encoding='utf-8') as fr:
    for line in fr:
        stopwords.add(line.strip())

# Segment the corpus with the jieba tokenizer
with open('exp4_corpus.txt', 'r', encoding='utf-8') as fr:  # open the raw corpus
    with open('seg_output.txt', 'w', encoding='utf-8') as fw:  # segmented output goes here
        for line in fr:  # process the corpus line by line
            words = jieba.cut(line.strip(), cut_all=False)  # precise-mode segmentation
            words = [word for word in words if word not in stopwords]  # drop stopwords
            fw.write(' '.join(words) + '\n')  # one space-separated sentence per line
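
# Optional sanity check (a minimal sketch): peek at the first segmented line
# to confirm the output is space-separated tokens before training on it.
with open('seg_output.txt', 'r', encoding='utf-8') as fr:
    print(fr.readline().strip())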

# Train word vectors with gensim's Word2Vec in CBOW mode
model = Word2Vec(
    LineSentence('seg_output.txt'),  # LineSentence accepts a file path directly
    vector_size=100, min_count=1, window=5, workers=4,
    sg=0)  # sg=0 selects CBOW (the default); sg=1 would select skip-gram
model.wv.save_word2vec_format('word2vec.vector', binary=False)  # save vectors as plain text
model.save('word2vec.model')  # save the full model (supports further training)
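
# Quick usage check (a minimal sketch): reload the saved model and query it.
# '北京' is only a placeholder word; substitute any token that actually appears
# in your segmented corpus, otherwise the lookup raises a KeyError.
loaded = Word2Vec.load('word2vec.model')
print(loaded.wv['北京'])  # the 100-dimensional vector for the word
print(loaded.wv.most_similar('北京', topn=5))  # nearest neighbors by cosine similarity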

